{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e3e2a59a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time setup: uncomment to extract the dataset archive before loading it.\n",
    "# !unzip all_data.csv.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "32e0f787",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "59867e66",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1999516, 46)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the full comment dataset from the CSV in the working directory\n",
    "# and show its (rows, columns) shape.\n",
    "df = pd.read_csv(filepath_or_buffer='all_data.csv')\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6f373c88",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns to exclude from the working frame; every other column is kept.\n",
    "rejected = [\n",
    "    'severe_toxicity', 'wow',\n",
    "    'identity_annotator_count', 'toxicity_annotator_count',\n",
    "    'likes',\n",
    "    'other_gender', 'other_sexual_orientation', 'other_race_or_ethnicity',\n",
    "    'other_religion', 'other_disability',\n",
    "    'split', 'id', 'rating', 'article_id',\n",
    "    'white', 'created_date', 'black',\n",
    "    'sad', 'funny', 'disagree',\n",
    "    'publication_id', 'parent_id',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8fbc6277",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>female</th>\n",
       "      <th>sexual_explicit</th>\n",
       "      <th>transgender</th>\n",
       "      <th>identity_attack</th>\n",
       "      <th>toxicity</th>\n",
       "      <th>threat</th>\n",
       "      <th>jewish</th>\n",
       "      <th>asian</th>\n",
       "      <th>latino</th>\n",
       "      <th>intellectual_or_learning_disability</th>\n",
       "      <th>...</th>\n",
       "      <th>comment_text</th>\n",
       "      <th>physical_disability</th>\n",
       "      <th>insult</th>\n",
       "      <th>bisexual</th>\n",
       "      <th>muslim</th>\n",
       "      <th>obscene</th>\n",
       "      <th>hindu</th>\n",
       "      <th>psychiatric_or_mental_illness</th>\n",
       "      <th>heterosexual</th>\n",
       "      <th>male</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.014925</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.373134</td>\n",
       "      <td>0.014925</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>He got his money... now he lies in wait till a...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.343284</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.089552</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.013158</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.092105</td>\n",
       "      <td>0.605263</td>\n",
       "      <td>0.065789</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Mad dog will surely put the liberals in mental...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.565789</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.065789</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.047619</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>And Trump continues his lifelong cowardice by ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.031746</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.592105</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.815789</td>\n",
       "      <td>0.105263</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>\"while arresting a man for resisting arrest\".\\...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.684211</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.552632</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.275000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.037500</td>\n",
       "      <td>0.550000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>Tucker and Paul are both total bad ass mofo's.</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.487500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.337500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   female  sexual_explicit  transgender  identity_attack  toxicity    threat  \\\n",
       "0     NaN         0.014925          NaN         0.000000  0.373134  0.014925   \n",
       "1     NaN         0.013158          NaN         0.092105  0.605263  0.065789   \n",
       "2     NaN         0.000000          NaN         0.047619  0.666667  0.000000   \n",
       "3     NaN         0.592105          NaN         0.000000  0.815789  0.105263   \n",
       "4     NaN         0.275000          NaN         0.037500  0.550000  0.000000   \n",
       "\n",
       "   jewish  asian  latino  intellectual_or_learning_disability  ...  \\\n",
       "0     NaN    NaN     NaN                                  NaN  ...   \n",
       "1     NaN    NaN     NaN                                  NaN  ...   \n",
       "2     NaN    NaN     NaN                                  NaN  ...   \n",
       "3     NaN    NaN     NaN                                  NaN  ...   \n",
       "4     NaN    NaN     NaN                                  NaN  ...   \n",
       "\n",
       "                                        comment_text  physical_disability  \\\n",
       "0  He got his money... now he lies in wait till a...                  NaN   \n",
       "1  Mad dog will surely put the liberals in mental...                  NaN   \n",
       "2  And Trump continues his lifelong cowardice by ...                  NaN   \n",
       "3  \"while arresting a man for resisting arrest\".\\...                  NaN   \n",
       "4     Tucker and Paul are both total bad ass mofo's.                  NaN   \n",
       "\n",
       "     insult  bisexual muslim   obscene  hindu  psychiatric_or_mental_illness  \\\n",
       "0  0.343284       NaN    NaN  0.089552    NaN                            NaN   \n",
       "1  0.565789       NaN    NaN  0.065789    NaN                            NaN   \n",
       "2  0.666667       NaN    NaN  0.031746    NaN                            NaN   \n",
       "3  0.684211       NaN    NaN  0.552632    NaN                            NaN   \n",
       "4  0.487500       NaN    NaN  0.337500    NaN                            NaN   \n",
       "\n",
       "   heterosexual  male  \n",
       "0           NaN   NaN  \n",
       "1           NaN   NaN  \n",
       "2           NaN   NaN  \n",
       "3           NaN   NaN  \n",
       "4           NaN   NaN  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the rejected columns while preserving the CSV's column order.\n",
    "# A list is used as the indexer because pandas no longer accepts a set\n",
    "# (deprecated in 1.5, TypeError since 2.0). Names in `rejected` that are\n",
    "# not present in the frame are ignored, as with the original set difference.\n",
    "rejected_names = set(rejected)\n",
    "df = df[[col for col in df.columns if col not in rejected_names]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7263aeaf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['female', 'sexual_explicit', 'transgender', 'identity_attack',\n",
       "       'toxicity', 'threat', 'jewish', 'asian', 'latino',\n",
       "       'intellectual_or_learning_disability', 'christian', 'atheist',\n",
       "       'homosexual_gay_or_lesbian', 'buddhist', 'comment_text',\n",
       "       'physical_disability', 'insult', 'bisexual', 'muslim', 'obscene',\n",
       "       'hindu', 'psychiatric_or_mental_illness', 'heterosexual', 'male'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "75f50cb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label columns: every remaining column except the raw comment text.\n",
    "# Kept as a list in frame order so it can be used directly as a pandas\n",
    "# indexer (sets are rejected by pandas >= 2.0) and so that iteration order\n",
    "# is deterministic. Built from df.columns, so it also works on an empty frame.\n",
    "selected_columns = [col for col in df.columns if col != 'comment_text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e0de6a43",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Positive pairs: one (comment, hypothesis) example for every label whose\n",
    "# score exceeds the threshold. A single vectorised comparison replaces the\n",
    "# ~2M per-row `df.iloc[i]` lookups; NaN scores compare False and are skipped.\n",
    "POSITIVE_THRESHOLD = 0.7\n",
    "\n",
    "# Materialise the labels as a list in frame order: pandas rejects set\n",
    "# indexers, and a fixed order keeps the generated pairs deterministic.\n",
    "label_columns = [col for col in df.columns if col in selected_columns]\n",
    "positive_mask = (df[label_columns] > POSITIVE_THRESHOLD).to_numpy()\n",
    "# np.nonzero walks the mask row by row, i.e. the same order as a row loop.\n",
    "row_idx, col_idx = np.nonzero(positive_mask)\n",
    "\n",
    "comments = df['comment_text'].to_numpy()\n",
    "X1, X2, Y = [], [], []\n",
    "for r, c in zip(row_idx, col_idx):\n",
    "    p = label_columns[c].replace('_', ' ')\n",
    "    X1.append(comments[r])\n",
    "    X2.append(f'teks ini berkaitan {p}')\n",
    "    Y.append(1)\n",
    "\n",
    "len(X1), len(X2), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5d662023",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Keep a random subset of 50,000 positive pairs (the 'test' side of the\n",
    "# split). A fixed random_state makes the exported subset reproducible.\n",
    "_, X1, _, X2, _, Y = train_test_split(\n",
    "    X1, X2, Y, test_size=50000, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f90edcb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Persist the sampled positive pairs as a single JSON object.\n",
    "positive_payload = {'X1': X1, 'X2': X2, 'Y': Y}\n",
    "with open('positive-toxicity.json', 'w') as out_file:\n",
    "    json.dump(positive_payload, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c7f12a1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Negative pairs: sample roughly 10% of the comments, then emit one\n",
    "# (comment, hypothesis) example for every label scored below the threshold.\n",
    "# Missing (NaN) scores compare False, so they never produce a negative pair.\n",
    "NEGATIVE_THRESHOLD = 0.05\n",
    "ROW_SAMPLE_RATE = 0.1\n",
    "\n",
    "# Seeded generator so the same rows are sampled on every run.\n",
    "rng = np.random.default_rng(42)\n",
    "sampled = df[rng.random(len(df)) <= ROW_SAMPLE_RATE]\n",
    "\n",
    "# List in frame order: pandas rejects set indexers, and a fixed order keeps\n",
    "# the generated pairs deterministic.\n",
    "label_columns = [col for col in df.columns if col in selected_columns]\n",
    "negative_mask = (sampled[label_columns] < NEGATIVE_THRESHOLD).to_numpy()\n",
    "# np.nonzero walks the mask row by row, i.e. the same order as a row loop.\n",
    "row_idx, col_idx = np.nonzero(negative_mask)\n",
    "\n",
    "comments = sampled['comment_text'].to_numpy()\n",
    "X1, X2, Y = [], [], []\n",
    "for r, c in zip(row_idx, col_idx):\n",
    "    p = label_columns[c].replace('_', ' ')\n",
    "    X1.append(comments[r])\n",
    "    X2.append(f'teks ini berkaitan {p}')\n",
    "    Y.append(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d309099b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50000"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep a random subset of 50,000 negative pairs (the 'test' side of the\n",
    "# split). A fixed random_state makes the exported subset reproducible.\n",
    "_, X1, _, X2, _, Y = train_test_split(\n",
    "    X1, X2, Y, test_size=50000, random_state=42\n",
    ")\n",
    "len(X1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "421681e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Persist the sampled negative pairs as a single JSON object.\n",
    "negative_payload = {'X1': X1, 'X2': X2, 'Y': Y}\n",
    "with open('negative-toxicity.json', 'w') as out_file:\n",
    "    json.dump(negative_payload, out_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
